Problem 1

1… 10

11

11.a A movie should appear in the dataset at least 18 times. Each has a record for the weekend (Friday, Saturday and Sunday) from the opening weekend to at least 6 weekends later (for the ones kept). The ones dropped were not in theaters for more than 6 weekends.

11.b

#keeping films that aren't dropped
films_used <- films |> 
  filter(dropped != 1)

11.c

# day when 12 Rounds came in
round_12_date <- as.Date("2009-03-27")

# Define the number of days to add
days_before <- 17984 #number under 12 Rounds "date" column

# Days prior to the 
reference_date <- round_12_date - days_before + 1

# Print the new date
print(reference_date)
## [1] "1960-01-01"

11.d

films_used_d <- films_used |> 
  mutate(movie_date = as.Date(reference_date + date)) |> 
  #putting the release_date in the 4th column
  select(title, production_budget, release_yr,
         movie_date, sat_date, everything())

films_used_d[, c("title", "movie_date")]

11.e

#first using sat_date to get the date for each saturday
films_used_date <- films_used_d |>
  #getting the day for saturday
  mutate(sat_day = reference_date + sat_date) |>
  mutate(sat_day_of_week = wday(sat_day, label = TRUE)) |>
  mutate(
    fri_dummy = ifelse(movie_date == sat_day - 1, 1, 0),
    sat_dummy = ifelse(movie_date == sat_day , 1, 0),
    #reasoning... there was no movie released on Sunday....
    sun_dummy = ifelse(movie_date == sat_day + 1, 1, 0) 
  ) |> arrange(title)

films_used_date[, c("title", "movie_date","sat_day" ,"fri_dummy", "sat_dummy", "sun_dummy")]

11.f

#creating dummies for week using fastDummies
films_used_date <- films_used_date |>  
  arrange(title, sat_day) |> 
  group_by(title) |> 
  # Assign numeric labels to unique elements of sat_date within each title
  mutate(week = as.integer(factor(sat_date)))


#Now using fast dummies...
films_used_date <- dummy_cols(films_used_date, select_columns = 'week')
films_used_date[, c("title", "movie_date" ,"week_1", "week_2", "week_3")]

11.g

#using the "Fast Dummies" library... to automatically create dummies for year
film <- dummy_cols(films_used_date, select_columns = 'release_yr')

film[, c("title", "release_yr", "release_yr_2009", "release_yr_2010")]

11.h

#combine the weekends 
temp <- film |> 
 mutate(weekend = case_when(
   sat_dummy == 1 ~ "Saturday",
   fri_dummy == 1 ~ "Friday",
   sun_dummy == 1 ~ "Sunday",
 )) |> 
  group_by(week, weekend) |> 
  summarize(mean = mean(tickets, na.rm = TRUE))

temp |> 
  ggplot(aes(x = week, y = mean, color = as.factor(weekend))) +
  geom_point() +
  geom_line() +
  scale_color_manual(values = c("Saturday" = "#4682B4", 
                               "Friday" = "red", 
                               "Sunday" = "#8B008B")) +
  labs(color = "Weekend",
       y = "Tickets",
       x = "Week") +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 6)) +  # Set x-axis ticks
  scale_y_continuous(breaks = scales::pretty_breaks(n = 6)) +  # Set y-axis ticks
  theme_bw()

12

NOT NEEDED

13

#subset colnames that have the hh in them
holiday <- str_subset(colnames(film), "hh")

#make the things in holiday "add"
holiday_dummy <- str_c(holiday, collapse = " + ")

#day of the week dummies
weekend_dummy <- str_c(str_subset(colnames(film), "dummy"), collapse = " + ")

#week of the year dummies
week_dummy <- str_c(str_subset(colnames(film), "week_"), collapse = " + ")

#year of the week dummy
year_dummy <- str_c(str_subset(colnames(film), "release_yr_"), collapse = " + ")

#combine
mod1 <- glue("tickets ~ {weekend_dummy} + {week_dummy} + {year_dummy} + {holiday_dummy}")

#fit a regression model
reg_mod1 <- lm(as.formula(mod1), data = film)


film <- film |> 
  mutate(pred_tickets = predict(reg_mod1, film)) |> 
  mutate(abnormal_viewership = tickets - pred_tickets)

film[, c("tickets","pred_tickets", "abnormal_viewership", "sat_day")]

14

weather <- read_dta("data/weather_collapsed_day.dta")

#adding www to the column names
original_cols <- colnames(weather) 

# adding prefix using the paste 
colnames(weather) <- paste("www", original_cols, sep = "_") 

weather
weather_film <- film |> 
  left_join(weather,
            #combine on dates, automatically filters out dates that don't match
            by = c("movie_date" = "www_date",
                   "sat_day" = "www_sat_date"))

weather_film |> 
  select(movie_date, sat_day, contains("www"))

15

# Select columns with names containing "www_"
www_columns <- str_subset(colnames(weather_film), "www_")

# Create a copy of the original dataframe
df <- weather_film 

# Define regression formula with dummy variables
regressors <-  glue("~ {weekend_dummy} + {week_dummy} + {year_dummy} + {holiday_dummy}")

# Iterate over columns with names containing "www_"
for (columns in www_columns) {
  # Construct regression formula
  model <- paste(columns, regressors)
  
  # Generate names for predicted values and residuals
  pred_name <- paste("pred", columns, sep = "_")
  resid_name <- paste("abnormal", columns, sep = "_")
  
  # Add predicted values and residuals to the dataframe
  df <- df |> 
      mutate(!!pred_name := predict(lm(as.formula(model), data = df), df)) |> 
    #residuals = column - predicted_value_for_column
      mutate(!!resid_name := eval(parse(text = columns)) - eval(parse(text = pred_name)))
}

#remove the predicted and original values, keeping only the residuals
new_weather <- df |> 
  select(-c(contains("pred_www"), starts_with("www")))

16

#combine
#fit a regression model
week_2_data <- new_weather |> 
  filter(week_2 == 1)

#using the same regression 
reg_mod2 <- lm(as.formula(mod1), data = week_2_data)


new_weather_film_wk2 <- week_2_data |> 
  mutate(pred_tickets_wk_2 = predict(reg_mod2, week_2_data)) |> 
  mutate(abnormal_viewership_wk_2 = tickets - pred_tickets_wk_2)


new_weather_film_wk2[, c("tickets", "pred_tickets_wk_2", "week_2", "abnormal_viewership_wk_2")]

17

#Mak
#subsetting the data to just be week 1
week_1_data <- new_weather |> 
  filter(week_1 == 1)

#creating the "abnormal viewerships in week 1"------------
mod1 <- glue("tickets ~ {weekend_dummy} + {week_dummy} + {year_dummy} + {holiday_dummy}")

#fit a regression model
reg_mod1 <- lm(as.formula(mod1), data = week_1_data)

new_weather_film_wk1 <- week_1_data |> 
  mutate(pred_tickets_wk_1 = predict(reg_mod1, week_1_data)) |> 
  mutate(abnormal_viewership_wk1 = tickets - pred_tickets_wk_1)

17.a OLS;

abnormal_weather_wk1_names <-
  str_subset(colnames(new_weather_film_wk1), "abnormal_www")

abnormal_weather_wk1 <-
  str_c(abnormal_weather_wk1_names, collapse = "+")

ols_glue <- glue("abnormal_viewership_wk1 ~ {abnormal_weather_wk1}")
ols_mod <- lm(as.formula(ols_glue),
     new_weather_film_wk1)

#modelsummary(list(ols_mod), output = "gt")

17.b

#subset the data to include the variables of interest
leaps_data <- new_weather_film_wk1 |> 
  select(c(abnormal_viewership_wk1, all_of(abnormal_weather_wk1_names)))

forward <- regsubsets(abnormal_viewership_wk1 ~ ., 
           data = leaps_data, method = "forward")
## Reordering variables and trying again:
# Get summary of the models
summary_forward <- summary(forward)

# Find the index of the model with the highest R-squared Adjusted
best_model_index_fwd <- which.max(summary_forward$adjr2) #9th model has the highest

# Get the names of predictors (coef) in the best model (9), without the intercept([-1])
best_adjr_predictors <- names(coef(forward, id = best_model_index_fwd)[-1])

# Print the selected predictors and the corresponding R-squared Adjusted value
best_adjr_predictors
## [1] "abnormal_www_rain"    "abnormal_www_mat5_60" "abnormal_www_mat5_85"
## [4] "abnormal_www_mat5_90" "abnormal_www_prec_1"  "abnormal_www_cloud_0"
## [7] "abnormal_www_cloud_4" "abnormal_www_cloud_5" "abnormal_www_cloud_8"
#running regressions based on the model from foward (adj R^2)
regs_fwd <- str_c(best_adjr_predictors, collapse = " + ")

fwd_glue <- glue("abnormal_viewership_wk1 ~ {regs_fwd}")

fwd_mod <- lm(as.formula(fwd_glue), data = new_weather_film_wk1)

17.c

#only show the last steps (trace = 0)
backward <- step(ols_mod, direction = "backward",trace=0)
best_bkwd_predictors <- names(coefficients(backward)[-1])

best_bkwd_predictors
## [1] "abnormal_www_rain"    "abnormal_www_mat5_45" "abnormal_www_mat5_55"
## [4] "abnormal_www_mat5_70" "abnormal_www_mat5_75" "abnormal_www_prec_0" 
## [7] "abnormal_www_cloud_3" "abnormal_www_cloud_4" "abnormal_www_mat_la"
#running regressions based on the model from backward
regs_bkwd <- str_c(best_bkwd_predictors, collapse = " + ")

bkwd_glue <- glue("abnormal_viewership_wk1 ~ {regs_bkwd}")

bkwd_mod <- lm(as.formula(bkwd_glue), data = new_weather_film_wk1)

17.d

lasso_mod <- cv.glmnet(
  x = as.matrix(new_weather_film_wk1 |>
                   select(all_of(abnormal_weather_wk1_names))),
  y = new_weather_film_wk1 |>
    pull(abnormal_viewership_wk1), #pull gets the numeric values
  alpha = 1, # Lasso penalty
  nfolds = 5 # 5 fold cross validation
)

new_weather_film_wk1 |> 
  mutate(pred = predict(lasso_mod, as.matrix(new_weather_film_wk1 |>
                   select(all_of(abnormal_weather_wk1_names))), s = "lambda.min")) 

21

#movies <- read_csv("data/movie_lens_20m/movie.csv")
#ratings <- read_csv("data/movie_lens_20m/rating.csv")